library(haven)

# Race lookup read from USCTOfficersToMerge.dta. The single-letter `gen_race`
# codes "B" and "W" are spelled out in a `race` column; rows carrying any other
# code are not assigned here.
race_guess <- read_dta("~/Desktop/Temperature study/USCTOfficersToMerge.dta")
coded_black <- race_guess$gen_race == "B"
coded_white <- race_guess$gen_race == "W"
race_guess$race[coded_black] <- "black"
race_guess$race[coded_white] <- "white"

# Columns extracted from each of the three disability files, in the order in
# which they are later renamed.
vars <- c(
  "recidnum", "a_age", "a_temp", "a_weight", "a_hgtft", "a_hgtin",
  "a_xmdate", "u_nphrts", "k_goiter", "u_cystts", "race",
  "best_birth_year", "current_fever", "state"
)


# First sample (dis_union_army_whites_csv.csv): every record is labelled
# "white", then state, best birth year and a current-fever flag are attached.
dataVet <- read.csv("~/Desktop/Temperature study/dis_union_army_whites_csv.csv")
dataVet$race <- "white"
dataVetBasics <- read.csv("~/Desktop/Temperature study/The_Basics_unionarmy.csv")
# For this sample the state comes from a_brdst; the lookup through the basics
# file is kept below for reference.
#dataVet$state <- as.character(dataVetBasics $gen_rb_statbest[match(dataVet$recidnum, dataVetBasics$recidnum)])
dataVet$state <- as.character(dataVet$a_brdst)
# Position of each record in the basics file (NA when the id is not found).
vet_basics_row <- match(dataVet$recidnum, dataVetBasics$recidnum)
dataVet$best_birth_year <- dataVetBasics$gen_bestbirthyear[vet_basics_row]
# TRUE when any of i_cur1..i_cur4 equals "AC YES"; NA when none does but at
# least one of them is missing.
vet_fever_hits <- lapply(
  c("i_cur1", "i_cur2", "i_cur3", "i_cur4"),
  function(col_name) dataVet[[col_name]] == "AC YES"
)
dataVet$current_fever <- Reduce(`|`, vet_fever_hits)


# Second sample (dis_usct_blacks_csv.csv). Race is looked up in race_guess by
# recidnum; records with no usable lookup value default to "black".
dataVetBlacks <- read.csv("~/Desktop/Temperature study/dis_usct_blacks_csv.csv")
usct_race <- race_guess$race[match(dataVetBlacks$recidnum, race_guess$recidnum)]
usct_race[is.na(usct_race)] <- "black"
dataVetBlacks$race <- usct_race
dataVetBlacksBasics <- read.csv("~/Desktop/Temperature study/The_Basics_usct.csv")
# Position of each record in the basics file (NA when the id is not found).
usct_basics_row <- match(dataVetBlacks$recidnum, dataVetBlacksBasics$recidnum)
dataVetBlacks$state <- as.character(dataVetBlacksBasics$gen_rb_statbest[usct_basics_row])
dataVetBlacks$best_birth_year <- dataVetBlacksBasics$gen_bestbirthyear[usct_basics_row]
# Only i_cur1..i_cur3 are consulted for this sample.
usct_fever_hits <- lapply(
  c("i_cur1", "i_cur2", "i_cur3"),
  function(col_name) dataVetBlacks[[col_name]] == "AC YES"
)
dataVetBlacks$current_fever <- Reduce(`|`, usct_fever_hits)


# Third sample (dis_expanded_usct_blacks_csv.csv), prepared the same way as the
# second one: race from race_guess with "black" as the fallback, state and best
# birth year from the matching basics file.
dataVetBlacksExp <- read.csv("~/Desktop/Temperature study/dis_expanded_usct_blacks_csv.csv")
usct_exp_race <- race_guess$race[match(dataVetBlacksExp$recidnum, race_guess$recidnum)]
usct_exp_race[is.na(usct_exp_race)] <- "black"
dataVetBlacksExp$race <- usct_exp_race
dataVetBlacksExpBasics <- read.csv("~/Desktop/Temperature study/The_Basics_expandedusct.csv")
# Position of each record in the basics file (NA when the id is not found).
usct_exp_basics_row <- match(dataVetBlacksExp$recidnum, dataVetBlacksExpBasics$recidnum)
dataVetBlacksExp$state <- as.character(dataVetBlacksExpBasics$gen_rb_statbest[usct_exp_basics_row])
dataVetBlacksExp$best_birth_year <- dataVetBlacksExpBasics$gen_bestbirthyear[usct_exp_basics_row]
# TRUE when any of i_cur1..i_cur4 equals "AC YES"; NA when none does but at
# least one of them is missing.
usct_exp_fever_hits <- lapply(
  c("i_cur1", "i_cur2", "i_cur3", "i_cur4"),
  function(col_name) dataVetBlacksExp[[col_name]] == "AC YES"
)
dataVetBlacksExp$current_fever <- Reduce(`|`, usct_exp_fever_hits)


# Stack the three samples on their shared columns and switch to analysis names.
dataVeterans <- rbind(
  dataVet[, vars],
  dataVetBlacks[, vars],
  dataVetBlacksExp[, vars]
)
names(dataVeterans) <- c(
  "study_ID", "age", "temp", "weightLB", "heightFT", "heightIN", "examdate",
  "nephritis", "goiter", "cystitis", "race", "best_birth_year",
  "current_fever", "state"
)

# Climate band assigned to each two-letter state code. Together the six bands
# list all 50 state codes exactly once, so the names of the lookup built below
# double as the whitelist of accepted state values.
climate_bands <- list(
  "hot" = c("LA", "FL", "HI"),
  "warm" = c("AZ", "TX", "AR", "MS", "AL", "GA", "SC"),
  "moderate" = c("CA", "OK", "KY", "TN", "NC", "VA"),
  "moderately_cold" = c("NM", "KS", "MO", "IL", "IN", "OH", "WV", "DE", "MD", "NJ"),
  "cold" = c("WA", "SD", "OR", "NV", "UT", "CO", "NE", "IA", "PA", "NY", "MA", "CT", "RI"),
  "very cold" = c("AK", "ID", "MT", "WY", "ND", "MN", "WI", "MI", "ME", "NH", "VT")
)
band_of_state <- setNames(
  rep(names(climate_bands), lengths(climate_bands)),
  unlist(climate_bands, use.names = FALSE)
)

# Anything that is not one of the 50 codes becomes NA before the factor
# conversion; such rows also get an NA state_type.
state_code <- dataVeterans$state
state_code[!(state_code %in% names(band_of_state))] <- NA
dataVeterans$state <- as.factor(state_code)
dataVeterans$state_type <- as.factor(unname(band_of_state[as.character(state_code)]))



# Turn the three condition columns into plain TRUE/FALSE flags: TRUE only where
# the recorded value is exactly "AC YES". `%in%` never yields NA, so missing
# entries come out as FALSE.
for (condition in c("goiter", "nephritis", "cystitis")) {
  dataVeterans[[condition]] <- dataVeterans[[condition]] %in% "AC YES"
}

# An undetermined fever status (NA) is treated as "no current fever".
dataVeterans$current_fever <- dataVeterans$current_fever %in% TRUE

dataVeterans$race <- as.factor(dataVeterans$race)

# Body measurements -----------------------------------------------------------
#dataVeterans$weightLB[dataVeterans$weightLB <30] <- dataVeterans$weightLB [dataVeterans$weightLB <30 ] * 10
dataVeterans$weightKG <- dataVeterans$weightLB * 0.453592

# Height = feet * 12 + inches, converted to cm. pmax(..., 0, na.rm = TRUE)
# makes a missing or negative component count as 0, so heightCM is never NA;
# a record with no height at all gets 0 cm (and a non-finite BMI).
height_inches <- pmax(dataVeterans$heightFT * 12, 0, na.rm = TRUE) +
  pmax(dataVeterans$heightIN, 0, na.rm = TRUE)
dataVeterans$heightCM <- height_inches * 2.54
dataVeterans$BMI <- dataVeterans$weightKG / ((dataVeterans$heightCM * 0.01)^2)

# Examination date ------------------------------------------------------------
# The raw value is read as a YYYYMMDD string. Exam years recorded as 1982 are
# treated as a transposition of 1892. The correction is applied to the raw
# string so that examyear, exammonth and examdate all agree; previously only
# examyear was corrected and examdate kept the year 1982.
exam_raw <- sub("^1982", "1892", as.character(dataVeterans$examdate))

dataVeterans$examyear <- as.numeric(substr(exam_raw, 1, 4))
dataVeterans$exammonth <- factor(
  substr(exam_raw, 5, 6),
  levels = c("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12")
)
dataVeterans$examdate <- as.Date(exam_raw, "%Y%m%d")

# Birth year ------------------------------------------------------------------
# Default to exam year minus age at examination; use best_birth_year instead
# wherever one is available.
dataVeterans$birth_year <- dataVeterans$examyear - dataVeterans$age
has_best_birth_year <- !is.na(dataVeterans$best_birth_year)
dataVeterans$birth_year[has_best_birth_year] <-
  dataVeterans$best_birth_year[has_best_birth_year]



# Break birth_year into decade cohorts "1780s" ... "1890s". Intervals are closed
# on the left and open on the right ([1780, 1790), ...); birth years outside
# 1780-1899, or missing, get an NA cohort.
decade_starts <- seq(1780, 1900, by = 10)
decade_labels <- paste0(decade_starts[-length(decade_starts)], "s")
cohort_bins <- cut(
  dataVeterans$birth_year,
  breaks = decade_starts,
  labels = decade_labels,
  right = FALSE
)
# Round-trip through character so that only the decades actually present end up
# as factor levels.
dataVeterans$birth_cohort <- as.factor(as.character(cohort_bins))

# Fahrenheit to Celsius.
dataVeterans$temp_C <- (dataVeterans$temp - 32) / 1.8


#vars_disease <- c("i_cur1","i_cur2","i_cur3","i_cur4",
#	"p_lrs1".."p_lrs10",i_fev[1-4] , i_eff101...i_eff426, u_tst01 ...u_tst26, l_cau1...l_cau10, g_skn  - "AC SMALLPOX ",  q_ind1, q_ind2, q_ind3, q_ind4) for definition in a separate gastrolist )

# This takes disease identifiers from all of the columns that we find relevant and builds a tall table linking each row in the dataVeterans table to its associated diseases.
# Some values contain multiple diseases (usually each introduced by an "AC" marker); we record each of them as a separate disease.
# We also strip all of the "middle modifiers", introduced by "MM" (e.g. saying that the pneumonia was on the left side), to keep the number of distinct diseases to a minimum.
#diseases <- data.frame(row=as.numeric(NULL),disease=as.character(NULL),var=as.character(NULL))

# Build `diseases`, a tall table with one row per disease mention:
#   row     - row index into the stacked dataVeterans table (the three data
#             sets are visited in the same order in which they were stacked,
#             and `offset` shifts each data set's local row numbers)
#   disease - the mention with the leading three characters ("AC ") and
#             everything from the first " MM " or " SM " onwards removed
#   var     - name of the source column
#
# Each column is processed in one vectorised pass and yields a single data
# frame, instead of one data frame per mention; the parsing rules and the
# resulting row order are unchanged.

# Column-name prefixes that hold disease strings. Columns starting with
# "l_cau" are deliberately left out.
disease_prefixes <- c("p_lrs", "i_fev", "i_eff", "u_tst", "q_ind", "g_skn")

# For every string, the text before the first occurrence of `separator`.
# Empty and NA inputs give NA, mirroring strsplit(x, separator)[[1]][1].
text_before <- function(strings, separator) {
  vapply(strsplit(strings, separator), function(parts) parts[1], character(1))
}

diseases <- list()

offset <- 0
idx <- 1

for (dataset in list(dataVet, dataVetBlacks, dataVetBlacksExp)) {
  cols <- unlist(lapply(disease_prefixes, function(prefix) {
    names(dataset)[substr(names(dataset), 1, 5) == prefix]
  }))

  for (cc in cols) {
    values <- as.character(dataset[, cc])
    # Rows with a non-blank entry (NA entries are dropped by which()).
    subs <- which(trimws(values) != "")
    entries <- values[subs]

    # Drop the first three characters (assumed to be the leading "AC "), then
    # split the remainder into individual mentions.
    mentions <- strsplit(substring(entries, 4, nchar(entries)), " AC ")
    flat <- unlist(mentions)

    if (length(flat) > 0) {
      core <- text_before(text_before(flat, " MM "), " SM ")
      diseases[[idx]] <- data.frame(
        row = rep(subs, lengths(mentions)) + offset,
        disease = trimws(core),
        var = cc,
        # Keep strings as character so the factor levels created below do not
        # depend on the R version's stringsAsFactors default.
        stringsAsFactors = FALSE
      )
      idx <- idx + 1
    }
  }
  offset <- offset + length(dataset$recidnum)
}

diseases <- do.call(rbind, diseases)

diseases$disease <- as.factor(diseases$disease)
diseases$var <- as.factor(diseases$var)

#new_levels <- as.character(levels(diseases$disease))[order(as.character(levels(diseases$disease)))]



# Disease category flags --------------------------------------------------------
# Each definition file lists disease names in its first column and has one
# indicator column per category (1 = the name belongs to the category). For each
# category a logical column is added to dataVeterans that is TRUE when any of
# the veteran's parsed disease mentions is in that category's name list.

disease_definitions <- read.csv("~/Desktop/Temperature study/UACWV infectious conditions all.csv")

GI_definitions <- read.csv("~/Desktop/Temperature study/gastrointestinal for mira.csv")

# Names from the first column of `definitions` on the rows whose indicator
# column equals 1.
#   definitions  - a definitions data frame as read above
#   flag_columns - candidate indicator column names, tried in order (partial
#                  name matching is allowed, as with `$`)
# Returns a character vector. Blank indicator cells (NA) never select a row,
# and NA or empty names are dropped, so a definition list can never match the
# NA or "" entries that the disease parser produces for empty mentions.
# Warns and returns character(0) when none of the candidate columns exists.
flagged_disease_names <- function(definitions, flag_columns) {
  flags <- NULL
  for (flag_column in flag_columns) {
    flags <- definitions[[flag_column, exact = FALSE]]
    if (!is.null(flags)) {
      break
    }
  }
  if (is.null(flags)) {
    warning(
      "No definition column named ", paste(flag_columns, collapse = " or "),
      "; the category will be empty",
      call. = FALSE
    )
    return(character(0))
  }
  disease_names <- as.character(definitions[flags %in% 1, 1])
  disease_names[!is.na(disease_names) & nzchar(disease_names)]
}

# One logical per row of dataVeterans: TRUE when the row has at least one entry
# in `diseases` whose name is listed in `definition`.
has_any_disease <- function(definition) {
  flagged <- rep(FALSE, nrow(dataVeterans))
  flagged[unique(diseases$row[diseases$disease %in% definition])] <- TRUE
  flagged
}

gastro_def <- flagged_disease_names(GI_definitions, "Fever.associated.")
dataVeterans$gastro <- has_any_disease(gastro_def)

# The TB list from the definitions file is extended with further spellings.
tb_def <- unique(c(
  flagged_disease_names(disease_definitions, "TB"),
  "PULMONARY TUBERCULOSIS", "TUBERCULOSIS", "TUBERCULAR DEPOSITS",
  "TUBERCULAR DISEASE",
  "COMMUNICATIONS ARE TAKING PLACE WITH TUBERCULAR CAVITIES",
  "TUBERCULAR ADHESIONS BOTH LUNGS", "TUBUCULAR DEPOSITS", "TUBERCLES",
  "PHTHISIS PULMONALIS", "PULMONARY PHTHISIS", "PHTHISIS",
  "FIBROID PHTHISIS", "INCIPIENT PHTHISIS", "HEMOPTYSIS"
))
dataVeterans$tb <- has_any_disease(tb_def)

malaria_def <- flagged_disease_names(disease_definitions, "Malaria")
dataVeterans$malaria <- has_any_disease(malaria_def)

typhoid_def <- flagged_disease_names(disease_definitions, "Typhoid")
dataVeterans$typhoid <- has_any_disease(typhoid_def)

plague_def <- flagged_disease_names(disease_definitions, "Plague")
dataVeterans$plague <- has_any_disease(plague_def)

# NOTE(review): the script has always read a column spelled "Tyhpus". If the
# file's header is actually "Typhus", that lookup silently selected nothing and
# the typhus flag was FALSE for everyone. Both spellings are accepted here, the
# historical one first - confirm against the CSV header.
typhus_def <- flagged_disease_names(disease_definitions, c("Tyhpus", "Typhus"))
dataVeterans$typhus <- has_any_disease(typhus_def)

syphilis_def <- flagged_disease_names(disease_definitions, "Syphilis")
dataVeterans$syphilis <- has_any_disease(syphilis_def)

fever_unspec_def <- flagged_disease_names(disease_definitions, "Fever.unspec")
dataVeterans$fever_unspec <- has_any_disease(fever_unspec_def)

# The pneumonia list from the definitions file is extended with further
# spellings.
pneumonia_def <- unique(c(
  flagged_disease_names(disease_definitions, "Pneumonia"),
  "PNEUMONIA", "HAD PNEUMONIA", "PLEURO-PNEUMONIA", "PLEUROPNEUMONIA",
  "BRONCHOPNEUMONIA", "INTERSTITIAL PNEUMONIA", "TYPHOID PNEUMONIA",
  "FORMER PNEUMONIA"
))
dataVeterans$pneumonia <- has_any_disease(pneumonia_def)

smallpox_def <- flagged_disease_names(disease_definitions, "Smallpox")
dataVeterans$smallpox <- has_any_disease(smallpox_def)

sepsis_def <- flagged_disease_names(disease_definitions, "Sepsis")
dataVeterans$sepsis <- has_any_disease(sepsis_def)

osteomyelitis_def <- flagged_disease_names(disease_definitions, "Osteomyelitis")
dataVeterans$osteomyelitis <- has_any_disease(osteomyelitis_def)

hepatitis_def <- flagged_disease_names(disease_definitions, "Hepatitis")
dataVeterans$hepatitis <- has_any_disease(hepatitis_def)

cholera_def <- flagged_disease_names(disease_definitions, "Cholera")
dataVeterans$cholera <- has_any_disease(cholera_def)

dengue_def <- flagged_disease_names(disease_definitions, "Dengue")
dataVeterans$dengue <- has_any_disease(dengue_def)

influenza_def <- flagged_disease_names(disease_definitions, "Influenza")
dataVeterans$influenza <- has_any_disease(influenza_def)

rheumatic_fever_def <- flagged_disease_names(disease_definitions, "Rheumatic.fever")
dataVeterans$rheumatic_fever <- has_any_disease(rheumatic_fever_def)

scarlet_fever_def <- flagged_disease_names(disease_definitions, "Scarlet.fever")
# Add other definitions if necessary
dataVeterans$scarlet_fever <- has_any_disease(scarlet_fever_def)




# Analysis sample: keep a record only when every criterion below holds. A
# missing value in any criterion excludes the record.
dataVeterans_subset <- with(dataVeterans, {
  # Recorded temperature, strictly between 35 and 39 degrees C.
  temp_ok <- !is.na(temp) & temp_C > 35 & temp_C < 39
  # Age strictly between 20 and 80, and no current fever.
  age_ok <- !is.na(age) & age > 20 & age < 80
  no_fever <- !current_fever
  # Height (120-220 cm) and weight (30-200 kg) within bounds, BMI computable.
  body_ok <- !is.na(BMI) & !is.na(heightCM) & !is.na(weightKG) &
    heightCM > 120 & heightCM < 220 &
    weightKG > 30 & weightKG < 200
  # Known birth year before 1860.
  birth_ok <- !is.na(birth_year) & birth_year < 1860

  temp_ok & age_ok & no_fever & body_ok & birth_ok
})

dataVeterans <- dataVeterans[dataVeterans_subset, ]
write.csv(dataVeterans, "~/Desktop/Temperature study/veterans_processed.csv")


